The competition dataset is based on the 2016 NYC Yellow Cab trip record data made available in Big Query on Google Cloud Platform. The data was originally published by the NYC Taxi and Limousine Commission (TLC). The data was sampled and cleaned for the purposes of this playground competition. Based on individual trip attributes, participants should predict the duration of each trip in the test set.
File descriptions
Data fields
Disclaimer: The decision was made to not remove dropoff coordinates from the dataset in order to provide an expanded set of variables to use in Kernels.
import pandas as pd #pandas for using dataframe and reading csv
import numpy as np #numpy for vector operations and basic maths
#import simplejson #getting JSON in simplified format
import urllib #for url stuff
#import gmaps #for using google maps to visulalize places on maps
import re #for processing regular expressions
import datetime #for datetime operations
import calendar #for calendar for datetime operations
import matplotlib.pyplot as plt # for plotting basic graphs and pie charts
import time #to get the system time
import scipy #for other dependancies
from sklearn.cluster import KMeans # for doing K-means clustering
from haversine import haversine # for calculating haversine distance
import math #for basic maths operations
import seaborn as sns #for making plots
import matplotlib.pyplot as plt # for plotting
import os # for os commands
from scipy.misc import imread, imresize, imsave # for plots
import plotly.plotly as py
import plotly.graph_objs as go
import plotly
plotly.offline.init_notebook_mode() # run at the start of every ipython notebook
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# INPUT_FOLDER is the directory that holds the competition input files.
INPUT_FOLDER = '/Users/as186194/Documents/DOCUMENTS/TRIALS/Kaggle/Kaggle_NYC_Taxi/'
print('File Sizes:')
for f in os.listdir(INPUT_FOLDER):
    if 'zip' not in f:
        # Report each (unzipped) file's size in megabytes; os.path.join is
        # robust even if INPUT_FOLDER loses its trailing slash.
        size_mb = round(os.path.getsize(os.path.join(INPUT_FOLDER, f)) / 1000000, 2)
        print(f.ljust(30) + str(size_mb) + ' MB')
# Load the train/test splits provided by the competition.
train_df = pd.read_csv(os.path.join(INPUT_FOLDER, 'train.csv'))
test_df = pd.read_csv(os.path.join(INPUT_FOLDER, 'test.csv'))
# Quick sanity checks on shape and schema (notebook-style expression statements).
train_df.head()
train_df.shape
train_df.columns
test_df.head()
test_df.shape
test_df.columns
# Share of trips handled by each cab vendor, shown as a pie chart.
trips_per_vendor = train_df.groupby("vendor_id").size()
pie_colors = ['gold', 'lightskyblue']
plt.pie(trips_per_vendor, shadow=True, colors=pie_colors,
        labels=['Vendor 1', 'Vendor 2'], autopct='%1.1f%%')
plt.title('Cab Vendors')
# Frequency of each distinct trip duration, as a histogram and a bar plot.
trips_per_duration = train_df.groupby('trip_duration').size()
trips_per_duration.hist(bins=50, figsize=(10, 8))
trips_per_duration.plot(kind='bar')
plt.show()
# Distribution of raw trip_duration (seconds); +1 shift avoids a zero value.
sns.set(style="white", palette="muted", color_codes=True)
f, axes = plt.subplots(figsize=(11, 7), sharex=True)
sns.despine(left=True)
sns.distplot((train_df['trip_duration'].values + 1),
             axlabel='trip_duration', label='trip_duration', bins=50)
plt.setp(axes, yticks=[])
plt.tight_layout()

# Same distribution on a log scale, which spreads out the heavy right tail.
sns.set(style="white", palette="muted", color_codes=True)
f, axes = plt.subplots(1, 1, figsize=(11, 7), sharex=True)
sns.despine(left=True)
sns.distplot(np.log(train_df['trip_duration'].values + 1),
             axlabel='Log(trip_duration)', label='log(trip_duration)', bins=50, color="g")
plt.setp(axes, yticks=[])
plt.tight_layout()
end = time.time()  # NOTE(review): no matching `start` timer is visible in this file — confirm intent
Findings - The histogram and kernel density plot above show that the trip durations are roughly Gaussian, and a few trips have very large durations (around 350,000 seconds, i.e. about 100 hours), while most of the trips are much shorter: probably trips taken inside Manhattan or within New York City only.
# Distributions of the four raw coordinate columns on a 2x2 grid of axes.
sns.set(style="white", palette="muted", color_codes=True)
f, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=False, sharey=False)
sns.despine(left=True)
coordinate_panels = [
    ('pickup_latitude', (0, 0)),
    ('pickup_longitude', (0, 1)),
    ('dropoff_latitude', (1, 0)),
    ('dropoff_longitude', (1, 1)),
]
for column, (row_idx, col_idx) in coordinate_panels:
    sns.distplot(train_df[column].values, label=column, color="m",
                 bins=100, ax=axes[row_idx, col_idx])
plt.setp(axes, yticks=[])
plt.tight_layout()
plt.show()
Findings - From the plot above it is clear that the pickup and dropoff latitudes are centered around 40 to 41, and the longitudes are situated around -74 to -73.
We are not getting histogram-like plots when plotting latitude and longitude because the distplot function of seaborn is affected by outliers: trips whose endpoints are very far apart (for example latitude 32 to latitude 44) compress the scale so that the distribution appears as a single spike.
Let's remove those outlying trips by putting a cap on latitude and longitude and visualize the distributions of the latitudes and longitudes given to us.
# Restrict trips to a bounding box around NYC to drop far-away outliers:
# latitude in (40.6, 40.9) and longitude in (-74.05, -73.7) for both endpoints.
df = train_df.loc[(train_df.pickup_latitude > 40.6) & (train_df.pickup_latitude < 40.9)]
df = df.loc[(df.dropoff_latitude > 40.6) & (df.dropoff_latitude < 40.9)]
df = df.loc[(df.dropoff_longitude > -74.05) & (df.dropoff_longitude < -73.7)]
df = df.loc[(df.pickup_longitude > -74.05) & (df.pickup_longitude < -73.7)]
train_data = df.copy()
# Re-plot the four coordinate distributions on the capped data.
sns.set(style="white", palette="muted", color_codes=True)
f, axes = plt.subplots(2, 2, figsize=(12, 12), sharex=False, sharey=False)
sns.despine(left=True)
sns.distplot(train_data['pickup_latitude'].values, label='pickup_latitude', color="m", bins=100, ax=axes[0, 0])
sns.distplot(train_data['pickup_longitude'].values, label='pickup_longitude', color="g", bins=100, ax=axes[0, 1])
sns.distplot(train_data['dropoff_latitude'].values, label='dropoff_latitude', color="m", bins=100, ax=axes[1, 0])
sns.distplot(train_data['dropoff_longitude'].values, label='dropoff_longitude', color="g", bins=100, ax=axes[1, 1])
plt.setp(axes, yticks=[])
plt.tight_layout()
# BUG FIX: the original printed df.shape[0] twice (train_data is a copy of df,
# so both numbers were identical); the evident intent was to compare the row
# count before vs after the bounding-box filter.
print(train_df.shape[0], train_data.shape[0])
plt.show()
As we apply the caps on latitude and longitude above,
the spikes resolve into proper distributions in distplot (distplot is a histogram plot in the seaborn package); we can see that most of the trips are concentrated within these latitude/longitude bounds.
def _add_datetime_features(df, source_col, out_cols):
    """Parse ``df[source_col]`` to datetime in place and add derived calendar columns.

    ``out_cols`` maps the keys 'month', 'hour', 'week_of_year', 'day_of_year'
    and 'day_of_week' to the destination column names created on ``df``.
    """
    df[source_col] = pd.to_datetime(df[source_col])
    dt = df[source_col].dt
    df.loc[:, out_cols['month']] = dt.month
    df.loc[:, out_cols['hour']] = dt.hour
    # Series.dt.weekofyear was removed from pandas; isocalendar().week is the
    # replacement (cast back to int64 to match the old dtype).
    df.loc[:, out_cols['week_of_year']] = dt.isocalendar().week.astype('int64')
    df.loc[:, out_cols['day_of_year']] = dt.dayofyear
    df.loc[:, out_cols['day_of_week']] = dt.dayofweek


test_data = test_df.copy()

# Pickup calendar date for both splits.
train_data['pickup_datetime'] = pd.to_datetime(train_data.pickup_datetime)
train_data.loc[:, 'pick_date'] = train_data['pickup_datetime'].dt.date
train_data.head()
test_data['pickup_datetime'] = pd.to_datetime(test_data.pickup_datetime)
test_data.loc[:, 'pick_date'] = test_data['pickup_datetime'].dt.date
test_data.head()

# Calendar features: pickup-based for train and test, dropoff-based for train.
_PICKUP_COLS = {'month': 'pick_month', 'hour': 'hour',
                'week_of_year': 'week_of_year', 'day_of_year': 'day_of_year',
                'day_of_week': 'day_of_week'}
_DROPOFF_COLS = {'month': 'drop_month', 'hour': 'drop_hour',
                 'week_of_year': 'drop_week_of_year', 'day_of_year': 'drop_day_of_year',
                 'day_of_week': 'drop_day_of_week'}
_add_datetime_features(train_data, 'pickup_datetime', _PICKUP_COLS)
train_data.head()
_add_datetime_features(train_data, 'dropoff_datetime', _DROPOFF_COLS)
_add_datetime_features(test_data, 'pickup_datetime', _PICKUP_COLS)
test_data.head()
# Dropoff-time features are not built for the test split: the test set has no
# dropoff_datetime column (it is only known once a trip has finished).
# Build the feature-selection frame: drop identifiers and raw timestamps,
# encode store_and_fwd_flag as 0/1, and move trip_duration to the last column.
feature_sel_data = train_data.copy()
feature_sel_data = feature_sel_data.drop(["pickup_datetime", "dropoff_datetime", "pick_date", "id"], axis=1)
feature_sel_data["store_and_fwd_flag"].replace(['N', 'Y'], [0, 1], inplace=True)
feature_sel_data.head()
# Re-append the target so it sits after all the predictors.
duration_col = feature_sel_data['trip_duration']
feature_sel_data = feature_sel_data.drop('trip_duration', axis=1)
feature_sel_data["trip_duration"] = duration_col
#len(feature_sel_data.columns)
#USING RFE
# Feature scoring via stability selection: RandomizedLasso refits the Lasso on
# many resamples and scores how often each feature gets selected.
# NOTE(review): RandomizedLasso was deprecated and then removed from
# scikit-learn (gone as of 0.21) — this cell only runs on an old sklearn.
from sklearn.linear_model import RandomizedLasso
A = feature_sel_data.iloc[:,0:17]  # predictors: the first 17 columns
B = feature_sel_data["trip_duration"]  # target
column_names=feature_sel_data.columns
rlasso = RandomizedLasso(alpha=0.025)
rlasso.fit(A,B)
print ("Features sorted by their score:")
# Pair each (rounded) stability score with its column name, highest first.
print (sorted(zip(map(lambda x: round(x, 4), rlasso.scores_),
column_names), reverse=True))
Using RFE with linear regression to rank the features
# Rank all features with recursive feature elimination around a linear model.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
# Ordinary least squares is the estimator RFE repeatedly refits.
linreg = LinearRegression()
X_feats = feature_sel_data.iloc[:, 0:17]
y_target = feature_sel_data["trip_duration"]
column_names = feature_sel_data.columns
# n_features_to_select=None keeps eliminating until every feature is ranked.
selector = RFE(linreg, n_features_to_select=None)
selector.fit(X_feats, y_target)
print("Features sorted by their rank:")
print(sorted(zip((round(rank, 4) for rank in selector.ranking_), column_names)))
# Number of bookings per pickup hour (kept for later use).
booking_max_hour = feature_sel_data.groupby("hour").size()
# Side-by-side scatter maps of pickup and dropoff points, yellow on black.
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(15, 10))
panels = (
    (ax1, 'pickup_longitude', 'pickup_latitude', 'Pickups'),
    (ax2, 'dropoff_longitude', 'dropoff_latitude', 'Dropoffs'),
)
for axis, lon_col, lat_col, title in panels:
    train_data.plot(kind='scatter', x=lon_col, y=lat_col,
                    color='yellow',
                    s=.02, alpha=.6, subplots=True, ax=axis)
    axis.set_title(title)
    axis.set_facecolor('black')
import folium
# Interactive map of New York centred near Midtown Manhattan.
# NOTE(review): the 'Mapbox Bright' tile set requires a Mapbox API key in
# current folium releases — fall back to the default 'OpenStreetMap' if needed.
newyork_map = folium.Map(location=[40.767937, -73.982155], tiles='Mapbox Bright',
                         zoom_start=10)
# Drop a marker at the pickup point of each of the first 2000 trips.
for row in train_data[:2000].iterrows():
    # BUG FIX: the original passed a 4-element list (pickup lat/lon plus
    # dropoff lat/lon) as the marker location; folium expects a [lat, lon] pair.
    folium.RegularPolygonMarker(
        [row[1]['pickup_latitude'], row[1]['pickup_longitude']],
        radius=3,
        color='gold',
        popup=str(row[1]['passenger_count']) + ',' + str(row[1]['vendor_id']),
        fill_color='#FD8A6C'
    ).add_to(newyork_map)
newyork_map
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Blank RGB canvas, initialised to black. 3000 x 3500 pixels matches the capped
# coordinate box (lat 40.6-40.9, lon -74.05 to -73.7) at 1e-4 degree per pixel.
rgb = np.zeros((3000, 3500, 3), dtype=np.uint8)
rgb[..., 0] = 0
rgb[..., 1] = 0
rgb[..., 2] = 0
# Map each coordinate to an integer pixel index: offset by the bounding-box
# minimum (lat 40.6, lon -74.05) and scale by 1e4; int() truncates the fraction.
train_data['pick_lat_new'] = list(map(int, (train_data['pickup_latitude'] - (40.6000))*10000))
train_data['drop_lat_new'] = list(map(int, (train_data['dropoff_latitude'] - (40.6000))*10000))
train_data['pick_lon_new'] = list(map(int, (train_data['pickup_longitude'] - (-74.050))*10000))
train_data['drop_lon_new'] = list(map(int,(train_data['dropoff_longitude'] - (-74.050))*10000))
# Trip count per (pickup-lat-pixel, pickup-lon-pixel) cell.
summary_plot = pd.DataFrame(train_data.groupby(['pick_lat_new', 'pick_lon_new'])['id'].count())
summary_plot.reset_index(inplace = True)
summary_plot.head(120)
# Paint each pickup cell by traffic volume:
#   more than 50 trips -> yellow, more than 10 -> green, otherwise -> red.
lat_list = summary_plot['pick_lat_new'].unique()
for i in lat_list:
    # Hoist the per-latitude filter (the original recomputed the mask twice).
    lat_rows = summary_plot.loc[summary_plot['pick_lat_new'] == i]
    lon_list = lat_rows['pick_lon_new'].tolist()
    unit = lat_rows['id'].tolist()
    # PERF FIX: the original fetched each count with lon_list.index(j), an
    # O(n) scan per pixel that also returns only the first match; iterating
    # the (longitude, count) pairs together is linear and unambiguous.
    for j, a in zip(lon_list, unit):
        if (a // 50) > 0:
            rgb[i, j, 0] = 255  # consistent rgb[i, j, c] indexing throughout
            rgb[i, j, 1] = 255
            rgb[i, j, 2] = 0
        elif (a // 10) > 0:
            rgb[i, j, 0] = 0
            rgb[i, j, 1] = 255
            rgb[i, j, 2] = 0
        else:
            rgb[i, j, 0] = 255
            rgb[i, j, 1] = 0
            rgb[i, j, 2] = 0
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 20))
ax.imshow(rgb, cmap='hot')
ax.set_axis_off()
From the heatmap-style image above (yellow = high pickup density, green = medium, red = low):
feature_sel_data.columns
# Split the frame into pickup and dropoff coordinate views.
df_pick = feature_sel_data[['pickup_longitude', 'pickup_latitude']]
df_drop = feature_sel_data[['dropoff_longitude', 'dropoff_latitude']]
data_for_grouping = feature_sel_data.copy()
# BUG FIX: groupby was given a tuple of column names; current pandas treats a
# tuple as a single (hierarchical) key and warns/raises — pass a list of keys.
saff_pickup = data_for_grouping.groupby(['pickup_longitude', 'pickup_latitude']).size()
saff_dropoff = data_for_grouping.groupby(['dropoff_longitude', 'dropoff_latitude']).size()
def haversine_(lat1, lng1, lat2, lng2):
    """Great-circle (haversine) distance in km between two points given in degrees."""
    EARTH_RADIUS_KM = 6371
    lat1, lng1, lat2, lng2 = (np.radians(v) for v in (lat1, lng1, lat2, lng2))
    dlat = lat2 - lat1
    dlng = lng2 - lng1
    # Standard haversine formula on the unit sphere, then scaled to km.
    chord = np.sin(dlat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlng * 0.5) ** 2
    return 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(chord))
def manhattan_distance_pd(lat1, lng1, lat2, lng2):
    """Taxicab-style distance: sum of the east-west and north-south haversine legs."""
    east_west = haversine_(lat1, lng1, lat1, lng2)
    north_south = haversine_(lat1, lng1, lat2, lng1)
    return east_west + north_south
import math
def bearing_array(lat1, lng1, lat2, lng2):
    """Initial compass bearing in degrees from point 1 to point 2.

    Adapted from beluga's notebook: works on whole NumPy arrays at once,
    which is noticeably faster than an element-wise implementation.
    """
    dlng_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    numer = np.sin(dlng_rad) * np.cos(lat2)
    denom = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlng_rad)
    return np.degrees(np.arctan2(numer, denom))
#train_data = train_df
# Engineered distance/direction features for every trip (vectorised helpers).
train_data.loc[:, 'hvsine_pick_drop'] = haversine_(train_data['pickup_latitude'].values, train_data['pickup_longitude'].values, train_data['dropoff_latitude'].values, train_data['dropoff_longitude'].values)
train_data.loc[:, 'manhtn_pick_drop'] = manhattan_distance_pd(train_data['pickup_latitude'].values, train_data['pickup_longitude'].values, train_data['dropoff_latitude'].values, train_data['dropoff_longitude'].values)
train_data.loc[:, 'bearing'] = bearing_array(train_data['pickup_latitude'].values, train_data['pickup_longitude'].values, train_data['dropoff_latitude'].values, train_data['dropoff_longitude'].values)
train_data.head()
# NOTE(review): the drop_* columns derive from dropoff_datetime, which is not
# known before a trip ends — using them as predictors leaks the target.
model_df = train_data[['day_of_year', 'drop_day_of_year', 'drop_hour', 'drop_month',
                       'dropoff_latitude', 'dropoff_longitude', 'hour', 'pickup_longitude', 'trip_duration']].copy()
model_df.columns
# FIX: write next to the other inputs via INPUT_FOLDER instead of duplicating
# the hard-coded absolute path (the subsequent read uses INPUT_FOLDER too).
model_df.to_csv(os.path.join(INPUT_FOLDER, 'model_df.csv'), sep=',')
'''from sklearn.model_selection import train_test_split
X = model_df.iloc[:,1:8]
Y = model_df["trip_duration"]
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3)'''
# Reload a 5000-row sample of the engineered frame; drop the csv index column.
models_try = pd.read_csv(INPUT_FOLDER + 'model_df.csv', nrows=5000)
models_try = models_try.drop("Unnamed: 0", axis=1)
models_try.head()
predict = models_try['trip_duration']
# Keyword axis: the positional `drop(col, 1)` form was removed in pandas 2.
predictors = models_try.drop('trip_duration', axis=1)
# BUG FIX: trip_duration is a continuous target, so this is regression; the
# original fitted RandomForestClassifier, which treats every distinct duration
# as a separate class. Use the regressor (its score() reports R^2).
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor(n_jobs=2)
clf.fit(predictors, predict)
clf.score(predictors, predict)
# Build the test-side feature frame. NOTE(review): training used drop_*
# (dropoff-time) columns that do not exist for the test split, so pickup-based
# columns are repeated here purely to match the trained model's feature count
# and order ('day_of_year' stands in for 'drop_day_of_year', 'hour' for
# 'drop_hour', 'pick_month' for 'drop_month') — confirm this proxying is intended.
models_test = test_data[['day_of_year', 'day_of_year', 'hour', 'pick_month',
                         'dropoff_latitude', 'dropoff_longitude', 'hour', 'pickup_longitude']].copy()
target = models_try.trip_duration
# Keyword axis: the positional `drop(col, 1)` form was removed in pandas 2.
models_try = models_try.drop('trip_duration', axis=1)
#prediction = clf.predict(models_test)
from sklearn.ensemble import RandomForestRegressor
# Large leaf/split minimums keep the 1000 trees shallow and limit overfitting.
rf_model = RandomForestRegressor(n_estimators=1000, min_samples_leaf=50, min_samples_split=75)
rf_model.fit(models_try.values, target)
predictions = rf_model.predict(models_test.values)
predictions[:5]
predictions[len(predictions) - 5:]
# BUG FIX: the original truncated test_df to its first 15000 rows here, but
# `predictions` holds one value per row of the full test set, so the column
# assignment below would raise a length-mismatch ValueError. Keep all rows.
len(predictions)
test_df['trip_duration'] = predictions
Id = test_df['id']
# Submission file: one (id, trip_duration) row per test trip, gzip-compressed.
test_df[['id', 'trip_duration']].to_csv(INPUT_FOLDER + 'arnav.csv.gz', index=False, compression='gzip')
test_df['trip_duration'][:5]
test_df.head()